import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from math import ceil
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
import scipy.stats as stats
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
import plotly.express as px
import plotly.graph_objects as go
import plotly
After Business Understanding, our goal is to start to be familiar with the data. Here, it's possible to understand some possible patterns, first insights that could be useful for the next phases. We also visualize the target proportion in some variables.
data = pd.read_csv ('H2.csv')
data
| IsCanceled | LeadTime | ArrivalDateYear | ArrivalDateMonth | ArrivalDateWeekNumber | ArrivalDateDayOfMonth | StaysInWeekendNights | StaysInWeekNights | Adults | Children | ... | DepositType | Agent | Company | DaysInWaitingList | CustomerType | ADR | RequiredCarParkingSpaces | TotalOfSpecialRequests | ReservationStatus | ReservationStatusDate | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 6 | 2015 | July | 27 | 1 | 0 | 2 | 1 | 0.0 | ... | No Deposit | 6 | NULL | 0 | Transient | 0.00 | 0 | 0 | Check-Out | 2015-07-03 |
| 1 | 1 | 88 | 2015 | July | 27 | 1 | 0 | 4 | 2 | 0.0 | ... | No Deposit | 9 | NULL | 0 | Transient | 76.50 | 0 | 1 | Canceled | 2015-07-01 |
| 2 | 1 | 65 | 2015 | July | 27 | 1 | 0 | 4 | 1 | 0.0 | ... | No Deposit | 9 | NULL | 0 | Transient | 68.00 | 0 | 1 | Canceled | 2015-04-30 |
| 3 | 1 | 92 | 2015 | July | 27 | 1 | 2 | 4 | 2 | 0.0 | ... | No Deposit | 9 | NULL | 0 | Transient | 76.50 | 0 | 2 | Canceled | 2015-06-23 |
| 4 | 1 | 100 | 2015 | July | 27 | 2 | 0 | 2 | 2 | 0.0 | ... | No Deposit | 9 | NULL | 0 | Transient | 76.50 | 0 | 1 | Canceled | 2015-04-02 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 79325 | 0 | 23 | 2017 | August | 35 | 30 | 2 | 5 | 2 | 0.0 | ... | No Deposit | 394 | NULL | 0 | Transient | 96.14 | 0 | 0 | Check-Out | 2017-09-06 |
| 79326 | 0 | 102 | 2017 | August | 35 | 31 | 2 | 5 | 3 | 0.0 | ... | No Deposit | 9 | NULL | 0 | Transient | 225.43 | 0 | 2 | Check-Out | 2017-09-07 |
| 79327 | 0 | 34 | 2017 | August | 35 | 31 | 2 | 5 | 2 | 0.0 | ... | No Deposit | 9 | NULL | 0 | Transient | 157.71 | 0 | 4 | Check-Out | 2017-09-07 |
| 79328 | 0 | 109 | 2017 | August | 35 | 31 | 2 | 5 | 2 | 0.0 | ... | No Deposit | 89 | NULL | 0 | Transient | 104.40 | 0 | 0 | Check-Out | 2017-09-07 |
| 79329 | 0 | 205 | 2017 | August | 35 | 29 | 2 | 7 | 2 | 0.0 | ... | No Deposit | 9 | NULL | 0 | Transient | 151.20 | 0 | 2 | Check-Out | 2017-09-07 |
79330 rows × 31 columns
data.columns
Index(['IsCanceled', 'LeadTime', 'ArrivalDateYear', 'ArrivalDateMonth',
'ArrivalDateWeekNumber', 'ArrivalDateDayOfMonth',
'StaysInWeekendNights', 'StaysInWeekNights', 'Adults', 'Children',
'Babies', 'Meal', 'Country', 'MarketSegment', 'DistributionChannel',
'IsRepeatedGuest', 'PreviousCancellations',
'PreviousBookingsNotCanceled', 'ReservedRoomType', 'AssignedRoomType',
'BookingChanges', 'DepositType', 'Agent', 'Company',
'DaysInWaitingList', 'CustomerType', 'ADR', 'RequiredCarParkingSpaces',
'TotalOfSpecialRequests', 'ReservationStatus', 'ReservationStatusDate'],
dtype='object')
data['IsCanceled'].value_counts()
0 46228 1 33102 Name: IsCanceled, dtype: int64
((data['Adults'] == 0) & (data['Children'] == 0) & (data['Babies'] == 0)).sum()
#we need to drop them since it doesn't count as bookings
167
(data['RequiredCarParkingSpaces']).value_counts()
#the majority of the customers doesn't require car parking spaces
0 77404 1 1921 2 3 3 2 Name: RequiredCarParkingSpaces, dtype: int64
(data['Meal']).value_counts()
#the majority of the customers tend to buy BB meal package
BB 62305 SC 10564 HB 6417 FB 44 Name: Meal, dtype: int64
(data['BookingChanges']).value_counts()
#in the majority of the bookings, there are no changes during the process
0 69062 1 7232 2 2244 3 467 4 194 5 46 6 31 7 19 8 9 14 5 9 4 13 3 15 3 10 3 11 2 20 1 17 1 21 1 12 1 18 1 16 1 Name: BookingChanges, dtype: int64
(data['IsRepeatedGuest']).value_counts()
#thisis the first time in the hotel for the majority of the customers
0 77298 1 2032 Name: IsRepeatedGuest, dtype: int64
data['MarketSegment'].value_counts()
#The main booking market segment is Online Travel Agencies (TA)
Online TA 38748 Offline TA/TO 16747 Groups 13975 Direct 6093 Corporate 2986 Complementary 542 Aviation 237 Undefined 2 Name: MarketSegment, dtype: int64
(data['DistributionChannel']).value_counts()
#The main booking distribution channels are Travel Agents (TA) and Tour Operators (TO)
TA/TO 68945 Direct 6780 Corporate 3408 GDS 193 Undefined 4 Name: DistributionChannel, dtype: int64
(data['Company']).value_counts().head(15) #high cardinality
NULL 75641
40 924
67 267
45 250
153 215
219 141
233 114
174 113
51 86
242 61
348 59
38 51
91 48
280 48
197 47
Name: Company, dtype: int64
(data['CustomerType']).value_counts()
Transient 59404 Transient-Party 17333 Contract 2300 Group 293 Name: CustomerType, dtype: int64
(data['TotalOfSpecialRequests']).value_counts()
0 47957 1 21420 2 8142 3 1587 4 198 5 26 Name: TotalOfSpecialRequests, dtype: int64
(data['Country']).value_counts().head(15) #high cardinality
PRT 30960 FRA 8804 DEU 6084 GBR 5315 ESP 4611 ITA 3307 BEL 1894 BRA 1794 USA 1618 NLD 1590 CHE 1295 IRL 1209 AUT 1053 CHN 865 SWE 720 Name: Country, dtype: int64
(data['PreviousCancellations']).value_counts()
#The majority of the customers doesn't register any previous cancellations
0 73941 1 5155 2 72 3 51 11 35 4 25 6 22 5 16 13 12 21 1 Name: PreviousCancellations, dtype: int64
(data['LeadTime']).value_counts()
0 3109
1 1865
2 1130
4 1052
3 1022
...
387 1
366 1
458 1
347 1
463 1
Name: LeadTime, Length: 453, dtype: int64
(data['ADR']).value_counts()
62.00 3593
75.00 2372
90.00 2208
65.00 2057
95.00 1501
...
156.70 1
119.38 1
134.47 1
105.31 1
157.71 1
Name: ADR, Length: 5405, dtype: int64
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 79330 entries, 0 to 79329 Data columns (total 31 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 IsCanceled 79330 non-null int64 1 LeadTime 79330 non-null int64 2 ArrivalDateYear 79330 non-null int64 3 ArrivalDateMonth 79330 non-null object 4 ArrivalDateWeekNumber 79330 non-null int64 5 ArrivalDateDayOfMonth 79330 non-null int64 6 StaysInWeekendNights 79330 non-null int64 7 StaysInWeekNights 79330 non-null int64 8 Adults 79330 non-null int64 9 Children 79326 non-null float64 10 Babies 79330 non-null int64 11 Meal 79330 non-null object 12 Country 79306 non-null object 13 MarketSegment 79330 non-null object 14 DistributionChannel 79330 non-null object 15 IsRepeatedGuest 79330 non-null int64 16 PreviousCancellations 79330 non-null int64 17 PreviousBookingsNotCanceled 79330 non-null int64 18 ReservedRoomType 79330 non-null object 19 AssignedRoomType 79330 non-null object 20 BookingChanges 79330 non-null int64 21 DepositType 79330 non-null object 22 Agent 79330 non-null object 23 Company 79330 non-null object 24 DaysInWaitingList 79330 non-null int64 25 CustomerType 79330 non-null object 26 ADR 79330 non-null float64 27 RequiredCarParkingSpaces 79330 non-null int64 28 TotalOfSpecialRequests 79330 non-null int64 29 ReservationStatus 79330 non-null object 30 ReservationStatusDate 79330 non-null object dtypes: float64(2), int64(16), object(13) memory usage: 18.8+ MB
data.isna().sum()
#missing values in "Country" and "Children"
IsCanceled 0 LeadTime 0 ArrivalDateYear 0 ArrivalDateMonth 0 ArrivalDateWeekNumber 0 ArrivalDateDayOfMonth 0 StaysInWeekendNights 0 StaysInWeekNights 0 Adults 0 Children 4 Babies 0 Meal 0 Country 24 MarketSegment 0 DistributionChannel 0 IsRepeatedGuest 0 PreviousCancellations 0 PreviousBookingsNotCanceled 0 ReservedRoomType 0 AssignedRoomType 0 BookingChanges 0 DepositType 0 Agent 0 Company 0 DaysInWaitingList 0 CustomerType 0 ADR 0 RequiredCarParkingSpaces 0 TotalOfSpecialRequests 0 ReservationStatus 0 ReservationStatusDate 0 dtype: int64
data.describe(include = 'all').T #descriptive statistics
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| IsCanceled | 79330.0 | NaN | NaN | NaN | 0.41727 | 0.493111 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 |
| LeadTime | 79330.0 | NaN | NaN | NaN | 109.735724 | 110.948526 | 0.0 | 23.0 | 74.0 | 163.0 | 629.0 |
| ArrivalDateYear | 79330.0 | NaN | NaN | NaN | 2016.174285 | 0.699181 | 2015.0 | 2016.0 | 2016.0 | 2017.0 | 2017.0 |
| ArrivalDateMonth | 79330 | 12 | August | 8983 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| ArrivalDateWeekNumber | 79330.0 | NaN | NaN | NaN | 27.177449 | 13.398523 | 1.0 | 17.0 | 27.0 | 38.0 | 53.0 |
| ArrivalDateDayOfMonth | 79330.0 | NaN | NaN | NaN | 15.786625 | 8.728451 | 1.0 | 8.0 | 16.0 | 23.0 | 31.0 |
| StaysInWeekendNights | 79330.0 | NaN | NaN | NaN | 0.795185 | 0.885026 | 0.0 | 0.0 | 1.0 | 2.0 | 16.0 |
| StaysInWeekNights | 79330.0 | NaN | NaN | NaN | 2.182957 | 1.456416 | 0.0 | 1.0 | 2.0 | 3.0 | 41.0 |
| Adults | 79330.0 | NaN | NaN | NaN | 1.850977 | 0.509292 | 0.0 | 2.0 | 2.0 | 2.0 | 4.0 |
| Children | 79326.0 | NaN | NaN | NaN | 0.09137 | 0.372177 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 |
| Babies | 79330.0 | NaN | NaN | NaN | 0.004941 | 0.084323 | 0.0 | 0.0 | 0.0 | 0.0 | 10.0 |
| Meal | 79330 | 4 | BB | 62305 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Country | 79306 | 166 | PRT | 30960 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| MarketSegment | 79330 | 8 | Online TA | 38748 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| DistributionChannel | 79330 | 5 | TA/TO | 68945 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| IsRepeatedGuest | 79330.0 | NaN | NaN | NaN | 0.025615 | 0.157983 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| PreviousCancellations | 79330.0 | NaN | NaN | NaN | 0.079743 | 0.415472 | 0.0 | 0.0 | 0.0 | 0.0 | 21.0 |
| PreviousBookingsNotCanceled | 79330.0 | NaN | NaN | NaN | 0.132371 | 1.693411 | 0.0 | 0.0 | 0.0 | 0.0 | 72.0 |
| ReservedRoomType | 79330 | 8 | A | 62595 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| AssignedRoomType | 79330 | 9 | A | 57007 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| BookingChanges | 79330.0 | NaN | NaN | NaN | 0.187369 | 0.60862 | 0.0 | 0.0 | 0.0 | 0.0 | 21.0 |
| DepositType | 79330 | 3 | No Deposit | 66442 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Agent | 79330 | 224 | 9 | 31955 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Company | 79330 | 208 | NULL | 75641 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| DaysInWaitingList | 79330.0 | NaN | NaN | NaN | 3.226774 | 20.87089 | 0.0 | 0.0 | 0.0 | 0.0 | 391.0 |
| CustomerType | 79330 | 4 | Transient | 59404 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| ADR | 79330.0 | NaN | NaN | NaN | 105.304465 | 43.602954 | 0.0 | 79.2 | 99.9 | 126.0 | 5400.0 |
| RequiredCarParkingSpaces | 79330.0 | NaN | NaN | NaN | 0.024367 | 0.154919 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 |
| TotalOfSpecialRequests | 79330.0 | NaN | NaN | NaN | 0.546918 | 0.780776 | 0.0 | 0.0 | 0.0 | 1.0 | 5.0 |
| ReservationStatus | 79330 | 3 | Check-Out | 46228 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| ReservationStatusDate | 79330 | 864 | 2015-10-21 | 1416 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
data[data.duplicated (keep = 'first')]
#there are duplicated entries
| IsCanceled | LeadTime | ArrivalDateYear | ArrivalDateMonth | ArrivalDateWeekNumber | ArrivalDateDayOfMonth | StaysInWeekendNights | StaysInWeekNights | Adults | Children | ... | DepositType | Agent | Company | DaysInWaitingList | CustomerType | ADR | RequiredCarParkingSpaces | TotalOfSpecialRequests | ReservationStatus | ReservationStatusDate | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 9 | 1 | 62 | 2015 | July | 27 | 2 | 2 | 3 | 2 | 0.0 | ... | No Deposit | 8 | NULL | 0 | Transient | 76.50 | 0 | 1 | No-Show | 2015-07-02 |
| 12 | 0 | 43 | 2015 | July | 27 | 3 | 0 | 2 | 2 | 0.0 | ... | No Deposit | 1 | NULL | 0 | Transient-Party | 86.00 | 0 | 0 | Check-Out | 2015-07-05 |
| 17 | 0 | 43 | 2015 | July | 27 | 3 | 0 | 2 | 2 | 0.0 | ... | No Deposit | 1 | NULL | 0 | Transient-Party | 86.00 | 0 | 0 | Check-Out | 2015-07-05 |
| 18 | 0 | 43 | 2015 | July | 27 | 3 | 0 | 2 | 2 | 0.0 | ... | No Deposit | 1 | NULL | 0 | Transient-Party | 86.00 | 0 | 0 | Check-Out | 2015-07-05 |
| 22 | 0 | 43 | 2015 | July | 27 | 3 | 0 | 2 | 2 | 0.0 | ... | No Deposit | 1 | NULL | 0 | Transient-Party | 86.00 | 0 | 0 | Check-Out | 2015-07-05 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 79289 | 0 | 186 | 2017 | August | 35 | 31 | 0 | 3 | 2 | 0.0 | ... | No Deposit | 9 | NULL | 0 | Transient | 126.00 | 0 | 2 | Check-Out | 2017-09-03 |
| 79292 | 0 | 63 | 2017 | August | 35 | 31 | 0 | 3 | 3 | 0.0 | ... | No Deposit | 9 | NULL | 0 | Transient-Party | 195.33 | 0 | 2 | Check-Out | 2017-09-03 |
| 79293 | 0 | 63 | 2017 | August | 35 | 31 | 0 | 3 | 3 | 0.0 | ... | No Deposit | 9 | NULL | 0 | Transient-Party | 195.33 | 0 | 2 | Check-Out | 2017-09-03 |
| 79294 | 0 | 63 | 2017 | August | 35 | 31 | 0 | 3 | 3 | 0.0 | ... | No Deposit | 9 | NULL | 0 | Transient-Party | 195.33 | 0 | 2 | Check-Out | 2017-09-03 |
| 79313 | 0 | 175 | 2017 | August | 35 | 31 | 1 | 3 | 1 | 0.0 | ... | No Deposit | 42 | NULL | 0 | Transient | 82.35 | 0 | 1 | Check-Out | 2017-09-04 |
25902 rows × 31 columns
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 79330 entries, 0 to 79329 Data columns (total 31 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 IsCanceled 79330 non-null int64 1 LeadTime 79330 non-null int64 2 ArrivalDateYear 79330 non-null int64 3 ArrivalDateMonth 79330 non-null object 4 ArrivalDateWeekNumber 79330 non-null int64 5 ArrivalDateDayOfMonth 79330 non-null int64 6 StaysInWeekendNights 79330 non-null int64 7 StaysInWeekNights 79330 non-null int64 8 Adults 79330 non-null int64 9 Children 79326 non-null float64 10 Babies 79330 non-null int64 11 Meal 79330 non-null object 12 Country 79306 non-null object 13 MarketSegment 79330 non-null object 14 DistributionChannel 79330 non-null object 15 IsRepeatedGuest 79330 non-null int64 16 PreviousCancellations 79330 non-null int64 17 PreviousBookingsNotCanceled 79330 non-null int64 18 ReservedRoomType 79330 non-null object 19 AssignedRoomType 79330 non-null object 20 BookingChanges 79330 non-null int64 21 DepositType 79330 non-null object 22 Agent 79330 non-null object 23 Company 79330 non-null object 24 DaysInWaitingList 79330 non-null int64 25 CustomerType 79330 non-null object 26 ADR 79330 non-null float64 27 RequiredCarParkingSpaces 79330 non-null int64 28 TotalOfSpecialRequests 79330 non-null int64 29 ReservationStatus 79330 non-null object 30 ReservationStatusDate 79330 non-null object dtypes: float64(2), int64(16), object(13) memory usage: 18.8+ MB
#understand if there is seasonality
data['ArrivalDate'] = data['ArrivalDateMonth'] + '-' + data['ArrivalDateYear'].astype(str)
fig = plt.figure(figsize=(20, 8))
# count plot on single categorical variable
sns.countplot(x ='ArrivalDate',hue = 'IsCanceled', data = data)
plt.title ('Distribution of Bookings per month between 2015-2017', fontsize = 20, x = 0.5, y = 1.05)
plt.xticks(rotation = 90, fontsize = 15 )
plt.yticks(fontsize = 15)
plt.ylabel('Count', fontsize = 17)
plt.xlabel('Arrival Date', fontsize = 17)
plt.legend(title = 'IsCanceled', fontsize = 15)
plt.savefig('Distribution_Bookings_between_2015-2017.png', bbox_inches= 'tight' )
plt.show()
#Sorting the records by Year, Month and Day (Arrivals Date)
data['ArrivalDateMonthNumber'] = data['ArrivalDateMonth'].replace("January", "1").replace("February", "2").replace("March", "3").replace("April", "4").replace("May", "5").replace("June", "6").replace("July", "7").replace("August", "8").replace("September", "9").replace("October", "10").replace("November", "11").replace("December", "12")
df = data.groupby(['ArrivalDateYear', 'ArrivalDateMonthNumber','ArrivalDate', 'IsCanceled'])["LeadTime"].count()
df1 = data.groupby('ArrivalDate')["ArrivalDateYear"].count()
proportion = df/df1 * 100
proportion = pd.DataFrame (proportion)
proportion.reset_index(inplace = True)
proportion.set_index (['ArrivalDate'], inplace = True)
proportion['ArrivalDateMonthNumber']= pd.to_numeric(proportion['ArrivalDateMonthNumber'])
#Sorting by arrival date
proportion = proportion.sort_values(by = ['ArrivalDateYear','ArrivalDateMonthNumber'],
ascending = [True, True])
#selecting only the cancellation proportion per month/year
proportion = proportion[proportion['IsCanceled'] == 1]
proportion.reset_index(inplace = True)
fig, ax = plt.subplots(figsize=(20, 8))
plt.plot(proportion['ArrivalDate'] , proportion[0], color='b', marker='o')
plt.title('Cancellation Ratio per month between 2015-2017', fontsize = 20, x = 0.5, y = 1.05)
plt.xlabel('ArrivalDate', fontsize = 17)
plt.ylabel('Proportion', fontsize = 17)
plt.xticks(rotation = 90, fontsize = 15)
plt.yticks(fontsize = 15)
ax.set_ylim(ymin=0)
plt.grid(True)
plt.savefig('Cancellation_Ratio_2015-2017.png', bbox_inches= 'tight' )
plt.show()
data.drop(['ArrivalDate', 'ArrivalDateMonth'], axis = 1, inplace = True)
data['ArrivalDateMonthNumber']= pd.to_numeric(data['ArrivalDateMonthNumber'])
#defining categorical and numeric variables
num = ['LeadTime', 'ArrivalDateYear','ArrivalDateMonthNumber', 'ArrivalDateDayOfMonth', 'ArrivalDateWeekNumber', 'Adults', 'Children','Babies',
'StaysInWeekendNights','PreviousCancellations','PreviousBookingsNotCanceled',
'StaysInWeekNights','BookingChanges','DaysInWaitingList','TotalOfSpecialRequests', 'RequiredCarParkingSpaces', 'ADR' ]
cat = data.columns.drop(num).to_list()
cat.remove('ReservationStatusDate') #it's not useful for predictions: doesn't have any relationship with target
cat.remove('IsCanceled')
#Numeric Variables' Boxplots (initial dataset)
sns.set()
fig, axes = plt.subplots(5,4, figsize=(30, 20))
for ax, feat in zip(axes.flatten(), num):
sns.boxplot(x=data[feat], ax=ax)
title = "Numeric Variables' Box Plots (Initial Dataset)"
plt.suptitle(title, x = 0.5, y = 0.93, fontsize = 25)
plt.show()
# pairplot of numeric features (initial dataset)
sns.pairplot(data[num], diag_kind="hist")
plt.subplots_adjust(top=0.95)
plt.suptitle("Pairwise Relationship of Numeric Variables", fontsize=20)
plt.show()
# All Numeric Variables' Histograms in one figure
sns. set ()
#Prepare figure. Create individual axes where each box plot will be placed
fig, axes = plt.subplots (5, 4, figsize=(30, 20))
# Plot data
# Iterate across axes objects and associate each histogram:
for ax, feat in zip (axes.flatten (), num):
sns.histplot (x=data[feat], ax=ax)
# Layout
# Add a centered title to the figure:
title = "Numeric Variables' Histograms (Initial Dataset)"
plt.suptitle (title, x = 0.5, y = 0.93, fontsize = 25)
plt.subplots_adjust (wspace=0.3)
plt.show()
# categorical features' absolute frequencies
sns.set()
fig, axes = plt.subplots(3,4, figsize=(30, 20))
for ax, feat in zip(axes.flatten(), cat):
sns.countplot(x=data[feat], ax=ax, color='#007acc')
title = "Categorical Variables' Absolute Frequencies"
plt.suptitle(title,x = 0.5, y = 0.9, fontsize = 30)
plt.show()
# All Non-Metric/ Low Cardinality Variables' Relative Frequencies by Target
sns.set()
features = cat.copy()
#Remove high cardinality variables
features.remove('Country')
features.remove('Agent')
features.remove('Company')
# Prepare figure. Create individual axes where each bar plot will be placed
fig, axes = plt.subplots(3,3, figsize=(20, 12))
# Plot data
# Iterate across axes objects and associate each bar plot:
for ax, feat in zip(axes.flatten(), features):
data1 = data.groupby(['IsCanceled'])[feat].value_counts(normalize=True).rename('prop').reset_index()
sns.barplot(data=data1, x=feat, y='prop', hue='IsCanceled',ax=ax)
sns.move_legend(ax, "upper right")
title = "Categorical/Low Cardinality Variables' Relative Frequencies by Target"
plt.suptitle(title, x = 0.5, y = 0.92,fontsize = 20)
# Rotating X-axis labels
axes.flatten()[1].tick_params(axis='x', labelrotation = 90)
plt.subplots_adjust(wspace=0.3, hspace=0.8)
plt.show()
# Prepare figure
fig = plt.figure(figsize=(10, 8))
# Obtain correlation matrix. Round the values to 2 decimal cases. Use the DataFrame corr() and round() method.
corr = np.round(data[num].corr(method="pearson"), decimals=2)
# Build annotation matrix (values above |0.5| will appear annotated in the plot)
mask_annot = np.absolute(corr.values) >= 0.5
annot = np.where(mask_annot, corr.values, np.full(corr.shape,""))
# Plot heatmap of the correlation matrix
sns.heatmap(data=corr, annot=annot, cmap=sns.diverging_palette(220, 10, as_cmap=True),
fmt='s', vmin=-1, vmax=1, center=0, square=True, linewidths=.5)
# Layout
fig.subplots_adjust(top=0.95)
fig.suptitle("Pearson Correlation Matrix", fontsize=20)
plt.show()
countries = data.groupby(['Country'])['LeadTime'].count()
countries = pd.DataFrame(countries)
countries.reset_index(inplace = True)
countries.rename (columns={'LeadTime': 'NumberBookings'}, inplace = True)
countries = countries.sort_values('NumberBookings', ascending = False ).head(10)
countries
| Country | NumberBookings | |
|---|---|---|
| 125 | PRT | 30960 |
| 50 | FRA | 8804 |
| 39 | DEU | 6084 |
| 53 | GBR | 5315 |
| 46 | ESP | 4611 |
| 75 | ITA | 3307 |
| 14 | BEL | 1894 |
| 23 | BRA | 1794 |
| 158 | USA | 1618 |
| 115 | NLD | 1590 |
colors=['#fae588','#f79d65','#f9dc5c','#e8ac65','#e76f51','#ef233c','#b7094c']
fig = px.treemap(countries, path=['Country'], values=countries['NumberBookings'], width=800, height=400,
title = "Top10 guests' nationalities",
color=None, hover_data = countries,
color_continuous_scale= None
)
fig.update_layout(
margin = dict(t=50, l=25, r=25, b=25))
fig.show('notebook')
#Get the number of customers by country
country_customers = data['Country'].value_counts()
#Using plotly to show customers in world map
fig = go.Figure(data=go.Choropleth(
locations = country_customers.index, #Define the country
z = country_customers.values, #Define the values to each country
colorscale = 'magma',
autocolorscale=False,
reversescale=True,
marker_line_color='darkgray',
marker_line_width=0.5,
colorbar_title = 'Bookings',
))
layout = fig.update_layout(
title_text='Density of Bookings By Country',
geo=dict(
showframe=True,
showcoastlines=False,
projection_type='equirectangular'
),
)
fig_choroplethmap = go.Figure(fig, layout)
fig_choroplethmap_= fig_choroplethmap.show(renderer='browser') #plot in browser to better visualization
The Data Preparation phase includes all of the tasks to prepare the dataset that are going to be applied in the next step. Some of the tasks are: outliers' removal, variables' scaling, missing values' imputation and feature engineering
data2 = data.copy()
#remove the duplicated entries
data2.drop_duplicates(keep = 'first', inplace = True)
data2.loc[data2.duplicated(keep = 'first'), :]
| IsCanceled | LeadTime | ArrivalDateYear | ArrivalDateWeekNumber | ArrivalDateDayOfMonth | StaysInWeekendNights | StaysInWeekNights | Adults | Children | Babies | ... | Agent | Company | DaysInWaitingList | CustomerType | ADR | RequiredCarParkingSpaces | TotalOfSpecialRequests | ReservationStatus | ReservationStatusDate | ArrivalDateMonthNumber |
|---|
0 rows × 31 columns
data2.reset_index(inplace = True)
data2.drop('index', axis = 1, inplace = True)
data2
| IsCanceled | LeadTime | ArrivalDateYear | ArrivalDateWeekNumber | ArrivalDateDayOfMonth | StaysInWeekendNights | StaysInWeekNights | Adults | Children | Babies | ... | Agent | Company | DaysInWaitingList | CustomerType | ADR | RequiredCarParkingSpaces | TotalOfSpecialRequests | ReservationStatus | ReservationStatusDate | ArrivalDateMonthNumber | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 6 | 2015 | 27 | 1 | 0 | 2 | 1 | 0.0 | 0 | ... | 6 | NULL | 0 | Transient | 0.00 | 0 | 0 | Check-Out | 2015-07-03 | 7 |
| 1 | 1 | 88 | 2015 | 27 | 1 | 0 | 4 | 2 | 0.0 | 0 | ... | 9 | NULL | 0 | Transient | 76.50 | 0 | 1 | Canceled | 2015-07-01 | 7 |
| 2 | 1 | 65 | 2015 | 27 | 1 | 0 | 4 | 1 | 0.0 | 0 | ... | 9 | NULL | 0 | Transient | 68.00 | 0 | 1 | Canceled | 2015-04-30 | 7 |
| 3 | 1 | 92 | 2015 | 27 | 1 | 2 | 4 | 2 | 0.0 | 0 | ... | 9 | NULL | 0 | Transient | 76.50 | 0 | 2 | Canceled | 2015-06-23 | 7 |
| 4 | 1 | 100 | 2015 | 27 | 2 | 0 | 2 | 2 | 0.0 | 0 | ... | 9 | NULL | 0 | Transient | 76.50 | 0 | 1 | Canceled | 2015-04-02 | 7 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 53423 | 0 | 23 | 2017 | 35 | 30 | 2 | 5 | 2 | 0.0 | 0 | ... | 394 | NULL | 0 | Transient | 96.14 | 0 | 0 | Check-Out | 2017-09-06 | 8 |
| 53424 | 0 | 102 | 2017 | 35 | 31 | 2 | 5 | 3 | 0.0 | 0 | ... | 9 | NULL | 0 | Transient | 225.43 | 0 | 2 | Check-Out | 2017-09-07 | 8 |
| 53425 | 0 | 34 | 2017 | 35 | 31 | 2 | 5 | 2 | 0.0 | 0 | ... | 9 | NULL | 0 | Transient | 157.71 | 0 | 4 | Check-Out | 2017-09-07 | 8 |
| 53426 | 0 | 109 | 2017 | 35 | 31 | 2 | 5 | 2 | 0.0 | 0 | ... | 89 | NULL | 0 | Transient | 104.40 | 0 | 0 | Check-Out | 2017-09-07 | 8 |
| 53427 | 0 | 205 | 2017 | 35 | 29 | 2 | 7 | 2 | 0.0 | 0 | ... | 9 | NULL | 0 | Transient | 151.20 | 0 | 2 | Check-Out | 2017-09-07 | 8 |
53428 rows × 31 columns
#we still need to remove these rows because they don't provide us any information (don't represent bookings)
data2[((data2['Adults'] == 0) & (data2['Children'] == 0) & (data2['Babies'] == 0)) == True]
| IsCanceled | LeadTime | ArrivalDateYear | ArrivalDateWeekNumber | ArrivalDateDayOfMonth | StaysInWeekendNights | StaysInWeekNights | Adults | Children | Babies | ... | Agent | Company | DaysInWaitingList | CustomerType | ADR | RequiredCarParkingSpaces | TotalOfSpecialRequests | ReservationStatus | ReservationStatusDate | ArrivalDateMonthNumber | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 167 | 0 | 132 | 2015 | 30 | 23 | 2 | 5 | 0 | 0.0 | 0 | ... | 1 | NULL | 0 | Transient-Party | 0.00 | 0 | 0 | Check-Out | 2015-07-30 | 7 |
| 433 | 0 | 0 | 2015 | 33 | 9 | 2 | 0 | 0 | 0.0 | 0 | ... | 9 | NULL | 0 | Contract | 0.00 | 0 | 0 | Check-Out | 2015-08-11 | 8 |
| 507 | 0 | 0 | 2015 | 33 | 10 | 9 | 20 | 0 | 0.0 | 0 | ... | NULL | 47 | 0 | Transient | 0.00 | 0 | 0 | Check-Out | 2015-09-08 | 8 |
| 541 | 0 | 0 | 2015 | 33 | 11 | 0 | 4 | 0 | 0.0 | 0 | ... | NULL | 49 | 0 | Transient-Party | 0.00 | 0 | 1 | Check-Out | 2015-08-15 | 8 |
| 1401 | 1 | 1 | 2015 | 37 | 6 | 2 | 0 | 0 | 0.0 | 0 | ... | 1 | NULL | 0 | Transient-Party | 0.00 | 0 | 0 | Canceled | 2015-09-05 | 9 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 49760 | 0 | 107 | 2017 | 26 | 27 | 0 | 3 | 0 | 0.0 | 0 | ... | 7 | NULL | 0 | Transient | 100.80 | 0 | 0 | Check-Out | 2017-06-30 | 6 |
| 49812 | 0 | 1 | 2017 | 26 | 30 | 0 | 1 | 0 | 0.0 | 0 | ... | NULL | NULL | 0 | Transient | 0.00 | 1 | 1 | Check-Out | 2017-07-01 | 6 |
| 50746 | 0 | 44 | 2017 | 28 | 15 | 1 | 1 | 0 | 0.0 | 0 | ... | 425 | NULL | 0 | Transient | 73.80 | 0 | 0 | Check-Out | 2017-07-17 | 7 |
| 51002 | 0 | 2 | 2017 | 28 | 15 | 2 | 5 | 0 | 0.0 | 0 | ... | 9 | NULL | 0 | Transient-Party | 22.86 | 0 | 1 | Check-Out | 2017-07-22 | 7 |
| 51465 | 0 | 170 | 2017 | 30 | 27 | 0 | 2 | 0 | 0.0 | 0 | ... | 52 | NULL | 0 | Transient | 0.00 | 0 | 0 | Check-Out | 2017-07-29 | 7 |
154 rows × 31 columns
bookings = data2.drop(data2.index[((data2['Adults'] == 0) & (data2['Children'] == 0) & (data2['Babies'] == 0)) == True].tolist())
bookings.reset_index(inplace = True)
bookings.drop('index', axis = 1, inplace = True)
bookings
| IsCanceled | LeadTime | ArrivalDateYear | ArrivalDateWeekNumber | ArrivalDateDayOfMonth | StaysInWeekendNights | StaysInWeekNights | Adults | Children | Babies | ... | Agent | Company | DaysInWaitingList | CustomerType | ADR | RequiredCarParkingSpaces | TotalOfSpecialRequests | ReservationStatus | ReservationStatusDate | ArrivalDateMonthNumber | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 6 | 2015 | 27 | 1 | 0 | 2 | 1 | 0.0 | 0 | ... | 6 | NULL | 0 | Transient | 0.00 | 0 | 0 | Check-Out | 2015-07-03 | 7 |
| 1 | 1 | 88 | 2015 | 27 | 1 | 0 | 4 | 2 | 0.0 | 0 | ... | 9 | NULL | 0 | Transient | 76.50 | 0 | 1 | Canceled | 2015-07-01 | 7 |
| 2 | 1 | 65 | 2015 | 27 | 1 | 0 | 4 | 1 | 0.0 | 0 | ... | 9 | NULL | 0 | Transient | 68.00 | 0 | 1 | Canceled | 2015-04-30 | 7 |
| 3 | 1 | 92 | 2015 | 27 | 1 | 2 | 4 | 2 | 0.0 | 0 | ... | 9 | NULL | 0 | Transient | 76.50 | 0 | 2 | Canceled | 2015-06-23 | 7 |
| 4 | 1 | 100 | 2015 | 27 | 2 | 0 | 2 | 2 | 0.0 | 0 | ... | 9 | NULL | 0 | Transient | 76.50 | 0 | 1 | Canceled | 2015-04-02 | 7 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 53269 | 0 | 23 | 2017 | 35 | 30 | 2 | 5 | 2 | 0.0 | 0 | ... | 394 | NULL | 0 | Transient | 96.14 | 0 | 0 | Check-Out | 2017-09-06 | 8 |
| 53270 | 0 | 102 | 2017 | 35 | 31 | 2 | 5 | 3 | 0.0 | 0 | ... | 9 | NULL | 0 | Transient | 225.43 | 0 | 2 | Check-Out | 2017-09-07 | 8 |
| 53271 | 0 | 34 | 2017 | 35 | 31 | 2 | 5 | 2 | 0.0 | 0 | ... | 9 | NULL | 0 | Transient | 157.71 | 0 | 4 | Check-Out | 2017-09-07 | 8 |
| 53272 | 0 | 109 | 2017 | 35 | 31 | 2 | 5 | 2 | 0.0 | 0 | ... | 89 | NULL | 0 | Transient | 104.40 | 0 | 0 | Check-Out | 2017-09-07 | 8 |
| 53273 | 0 | 205 | 2017 | 35 | 29 | 2 | 7 | 2 | 0.0 | 0 | ... | 9 | NULL | 0 | Transient | 151.20 | 0 | 2 | Check-Out | 2017-09-07 | 8 |
53274 rows × 31 columns
bookings1 = bookings.copy()
bookings1.median(numeric_only=True)
IsCanceled 0.0 LeadTime 51.0 ArrivalDateYear 2016.0 ArrivalDateWeekNumber 27.0 ArrivalDateDayOfMonth 16.0 StaysInWeekendNights 1.0 StaysInWeekNights 2.0 Adults 2.0 Children 0.0 Babies 0.0 IsRepeatedGuest 0.0 PreviousCancellations 0.0 PreviousBookingsNotCanceled 0.0 BookingChanges 0.0 DaysInWaitingList 0.0 ADR 105.3 RequiredCarParkingSpaces 0.0 TotalOfSpecialRequests 1.0 ArrivalDateMonthNumber 7.0 dtype: float64
modes = bookings1[cat].mode().loc[0]
modes
Meal BB Country PRT MarketSegment Online TA DistributionChannel TA/TO IsRepeatedGuest 0 ReservedRoomType A AssignedRoomType A DepositType No Deposit Agent 9 Company NULL CustomerType Transient ReservationStatus Check-Out Name: 0, dtype: object
bookings1.fillna(bookings1.median(), inplace=True)
bookings1.fillna(modes, inplace=True)
bookings1.isna().sum() # checking how many NaNs we still have
C:\Users\andre\AppData\Local\Temp/ipykernel_103112/3582061376.py:1: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError. Select only valid columns before calling the reduction.
IsCanceled 0 LeadTime 0 ArrivalDateYear 0 ArrivalDateWeekNumber 0 ArrivalDateDayOfMonth 0 StaysInWeekendNights 0 StaysInWeekNights 0 Adults 0 Children 0 Babies 0 Meal 0 Country 0 MarketSegment 0 DistributionChannel 0 IsRepeatedGuest 0 PreviousCancellations 0 PreviousBookingsNotCanceled 0 ReservedRoomType 0 AssignedRoomType 0 BookingChanges 0 DepositType 0 Agent 0 Company 0 DaysInWaitingList 0 CustomerType 0 ADR 0 RequiredCarParkingSpaces 0 TotalOfSpecialRequests 0 ReservationStatus 0 ReservationStatusDate 0 ArrivalDateMonthNumber 0 dtype: int64
# change data types of some variables
bookings1['Children'] = bookings1['Children'].astype(int)
bookings1.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 53274 entries, 0 to 53273 Data columns (total 31 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 IsCanceled 53274 non-null int64 1 LeadTime 53274 non-null int64 2 ArrivalDateYear 53274 non-null int64 3 ArrivalDateWeekNumber 53274 non-null int64 4 ArrivalDateDayOfMonth 53274 non-null int64 5 StaysInWeekendNights 53274 non-null int64 6 StaysInWeekNights 53274 non-null int64 7 Adults 53274 non-null int64 8 Children 53274 non-null int32 9 Babies 53274 non-null int64 10 Meal 53274 non-null object 11 Country 53274 non-null object 12 MarketSegment 53274 non-null object 13 DistributionChannel 53274 non-null object 14 IsRepeatedGuest 53274 non-null int64 15 PreviousCancellations 53274 non-null int64 16 PreviousBookingsNotCanceled 53274 non-null int64 17 ReservedRoomType 53274 non-null object 18 AssignedRoomType 53274 non-null object 19 BookingChanges 53274 non-null int64 20 DepositType 53274 non-null object 21 Agent 53274 non-null object 22 Company 53274 non-null object 23 DaysInWaitingList 53274 non-null int64 24 CustomerType 53274 non-null object 25 ADR 53274 non-null float64 26 RequiredCarParkingSpaces 53274 non-null int64 27 TotalOfSpecialRequests 53274 non-null int64 28 ReservationStatus 53274 non-null object 29 ReservationStatusDate 53274 non-null object 30 ArrivalDateMonthNumber 53274 non-null int64 dtypes: float64(1), int32(1), int64(17), object(12) memory usage: 12.4+ MB
#Numeric Variables' Boxplots
sns.set()
fig, axes = plt.subplots(5,4, figsize=(30, 30))
for ax, feat in zip(axes.flatten(), num):
sns.boxplot(x=bookings1[feat], ax=ax)
title = "Numeric Variables' Box Plots"
plt.suptitle(title, x = 0.5, y = 0.91, fontsize = 25)
plt.show()
bookings1[bookings1['ADR'] > 5000] #outlier
| IsCanceled | LeadTime | ArrivalDateYear | ArrivalDateWeekNumber | ArrivalDateDayOfMonth | StaysInWeekendNights | StaysInWeekNights | Adults | Children | Babies | ... | Agent | Company | DaysInWaitingList | CustomerType | ADR | RequiredCarParkingSpaces | TotalOfSpecialRequests | ReservationStatus | ReservationStatusDate | ArrivalDateMonthNumber | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4770 | 1 | 35 | 2016 | 13 | 25 | 0 | 1 | 2 | 0 | 0 | ... | 12 | NULL | 0 | Transient | 5400.0 | 0 | 0 | Canceled | 2016-02-19 | 3 |
1 rows × 31 columns
bookings1[bookings1['Babies'] > 4]
| IsCanceled | LeadTime | ArrivalDateYear | ArrivalDateWeekNumber | ArrivalDateDayOfMonth | StaysInWeekendNights | StaysInWeekNights | Adults | Children | Babies | ... | Agent | Company | DaysInWaitingList | CustomerType | ADR | RequiredCarParkingSpaces | TotalOfSpecialRequests | ReservationStatus | ReservationStatusDate | ArrivalDateMonthNumber | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3321 | 0 | 37 | 2016 | 3 | 12 | 0 | 2 | 2 | 0 | 10 | ... | 9 | NULL | 0 | Transient | 84.45 | 0 | 1 | Check-Out | 2016-01-14 | 1 |
| 20486 | 0 | 11 | 2015 | 42 | 11 | 2 | 1 | 1 | 0 | 9 | ... | 95 | NULL | 0 | Transient-Party | 95.00 | 0 | 0 | Check-Out | 2015-10-14 | 10 |
2 rows × 31 columns
bookings1[(bookings1['Babies'] > 4) & (bookings1['Adults'] <= 2)] #outlier
| IsCanceled | LeadTime | ArrivalDateYear | ArrivalDateWeekNumber | ArrivalDateDayOfMonth | StaysInWeekendNights | StaysInWeekNights | Adults | Children | Babies | ... | Agent | Company | DaysInWaitingList | CustomerType | ADR | RequiredCarParkingSpaces | TotalOfSpecialRequests | ReservationStatus | ReservationStatusDate | ArrivalDateMonthNumber | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3321 | 0 | 37 | 2016 | 3 | 12 | 0 | 2 | 2 | 0 | 10 | ... | 9 | NULL | 0 | Transient | 84.45 | 0 | 1 | Check-Out | 2016-01-14 | 1 |
| 20486 | 0 | 11 | 2015 | 42 | 11 | 2 | 1 | 1 | 0 | 9 | ... | 95 | NULL | 0 | Transient-Party | 95.00 | 0 | 0 | Check-Out | 2015-10-14 | 10 |
2 rows × 31 columns
#Manual Filtering
filters = (
(bookings1['Babies']<4)
&
(bookings1['ADR']<5000)
)
bookings_clean = bookings1[filters]
print('Percentage of data kept:', np.round(bookings_clean.shape[0] / bookings1.shape[0], 4))
Percentage of data kept: 0.9999
#Numeric Variables' Boxplots after Outliers' Removal
sns.set()
fig, axes = plt.subplots(5,4, figsize=(30, 30))
for ax, feat in zip(axes.flatten(), num):
sns.boxplot(x=bookings_clean[feat], ax=ax)
title = "Numeric Variables' Box Plots after Outliers'Removal"
plt.suptitle(title, x = 0.5, y = 0.91, fontsize = 25)
plt.show()
# All Numeric Variables' Histograms in one figure
sns. set ()
#Prepare figure. Create individual axes where each box plot will be placed
fig, axes = plt.subplots (4, 5, figsize=(30, 20))
# Plot data
# Iterate across axes objects and associate each histogram:
for ax, feat in zip (axes.flatten (), num):
sns.histplot (x=bookings_clean[feat], ax=ax)
# Layout
# Add a centered title to the figure:
title = "Numeric Variables' Histograms"
plt.suptitle (title, x = 0.5, y = 0.93, fontsize = 30)
plt.subplots_adjust (wspace=0.3)
plt.show()
# All Non-Metric/ Low Cardinality Variables' Relative Frequencies by Target
sns.set()
# Prepare figure. Create individual axes where each bar plot will be placed
fig, axes = plt.subplots(3,3, figsize=(30, 20))
# Plot data
# Iterate across axes objects and associate each bar plot:
for ax, feat in zip(axes.flatten(), features):
data1 = bookings_clean.groupby(['IsCanceled'])[feat].value_counts(normalize=True).rename('prop').reset_index()
sns.barplot(data=data1, x=feat, y='prop', hue='IsCanceled', ax=ax)
sns.move_legend(ax, "upper right")
title = "Categorical/Low Cardinality Variables' Relative Frequencies by Target"
plt.suptitle(title, fontsize = 25, x = 0.5, y = 0.95)
# Rotating X-axis labels
axes.flatten()[1].tick_params(axis='x', labelrotation = 90)
plt.subplots_adjust(wspace=0.2, hspace=0.5)
plt.show()
bookings2 = bookings_clean.copy()
bookings2.reset_index(inplace = True)
bookings2.drop('index', axis = 1, inplace = True)
#creating "Kids" (numeric variable) which represents the sum of children and babies
bookings2['Kids'] = bookings2['Children'] + bookings2['Babies']
bookings2.drop(['Children','Babies'] , axis = 1, inplace = True)
#creating "RoomChange" (categorical variable) which is important to understand if the room that was reserved is the same
#as the one assigned to the booking
bookings2[['ReservedRoomType', 'AssignedRoomType', 'IsCanceled']].head()
| ReservedRoomType | AssignedRoomType | IsCanceled | |
|---|---|---|---|
| 0 | A | A | 0 |
| 1 | A | A | 1 |
| 2 | A | A | 1 |
| 3 | A | A | 1 |
| 4 | A | A | 1 |
def roomchange_function (x, y):
list1 = bookings2[x].tolist()
list2 = bookings2[y].tolist()
room_change = []
for w,z in zip(list1, list2):
if w == z:
result = 'No'
room_change.append(result)
else:
result = 'Yes'
room_change.append(result)
return room_change
bookings2['RoomChange'] = roomchange_function ('ReservedRoomType', 'AssignedRoomType')
# aggregation of Country categories into top 10 + "Other": PRT, FRA, GBR, DEU, ESP, ITA, BEL, BRA, NLD, USA and Other
for i in [i for i in list(bookings2['Country'].unique()) if i not in ('PRT', 'FRA', 'GBR', 'DEU', 'ESP', 'ITA', 'BEL', 'BRA', 'NLD', 'USA')]:
bookings2.loc[bookings2[bookings2['Country'] == i].index, 'Country'] = 'Other'
bookings2['Country'].value_counts()
PRT 13125 Other 11224 FRA 7399 GBR 4519 DEU 4465 ESP 3777 ITA 2642 BEL 1690 BRA 1587 NLD 1430 USA 1413 Name: Country, dtype: int64
# aggregation of Agent categories into 9, NULL or Other
for i in [i for i in list(bookings2['Agent'].unique()) if i not in (' 9', ' NULL')]:
bookings2.loc[bookings2[bookings2['Agent'] == i].index, 'Agent'] = 'Other'
bookings2['Agent'].value_counts()
9 28714
Other 19014
NULL 5543
Name: Agent, dtype: int64
# aggregation of Company categories into Company or Null
for i in [i for i in list(bookings2['Company'].unique()) if i not in (' NULL')]:
bookings2.loc[bookings2[bookings2['Company'] == i].index, 'Company'] = 'Company'
bookings2['Company'].value_counts()
NULL 50506 Company 2765 Name: Company, dtype: int64
# aggregation of MarketSegment categories into Online TA, Offline TA/TO, Direct and Other
for i in [i for i in list(bookings2['MarketSegment'].unique()) if i not in ('Online TA','Offline TA/TO', 'Direct', 'Corporate', 'Groups')]:
bookings2.loc[bookings2[bookings2['MarketSegment'] == i].index, 'MarketSegment'] = 'Other'
bookings2['MarketSegment'].value_counts()
Online TA 34928 Offline TA/TO 7238 Direct 5538 Groups 2619 Corporate 2217 Other 731 Name: MarketSegment, dtype: int64
# aggregation of DistributionChannel categories into TA/TO, Direct and Other
for i in [i for i in list(bookings2['DistributionChannel'].unique()) if i not in ('TA/TO', 'Direct')]:
bookings2.loc[bookings2[bookings2['DistributionChannel'] == i].index, 'DistributionChannel'] = 'Other'
bookings2['DistributionChannel'].value_counts()
TA/TO 44440 Direct 6056 Other 2775 Name: DistributionChannel, dtype: int64
#Updating num list after Feature Engineering
num_out = ['Children', 'Babies']
#'StaysInWeekendNights', 'StaysInWeekNights',
for x in num_out:
num.remove(x)
num_in = ['Kids']
#'LengthStay'
for x in num_in:
num.append(x)
#Updating cat list after Feature Engineering
cat_out = ['ReservedRoomType', 'AssignedRoomType']
for x in cat_out:
cat.remove(x)
cat_in = ['RoomChange']
for x in cat_in:
cat.append(x)
#Updating features list (to visualize low cardinality variables)
features_out = ['ReservedRoomType', 'AssignedRoomType']
for x in features_out:
features.remove(x)
features_in = ['RoomChange', 'Country']
for x in features_in:
features.append(x)
# All Non-Metric/ Low Cardinality Variables' Relative Frequencies by Target
sns.set()
# Prepare figure. Create individual axes where each bar plot will be placed
fig, axes = plt.subplots(4,3, figsize=(20, 15))
# Plot data
# Iterate across axes objects and associate each bar plot:
for ax, feat in zip(axes.flatten(), cat):
data1 = bookings2.groupby(['IsCanceled'])[feat].value_counts(normalize=True).rename('prop').reset_index()
sns.barplot(data=data1, x=feat, y='prop', hue='IsCanceled', ax=ax)
sns.move_legend(ax, "upper right")
title = "Categorical/Low Cardinality Variables' Relative Frequencies by Target"
plt.suptitle(title, x = 0.5, y = 0.92)
# Rotating X-axis labels
axes.flatten()[2].tick_params(axis='x', labelrotation = 90)
plt.subplots_adjust(wspace=0.2, hspace=0.9)
plt.show()
num
['LeadTime', 'ArrivalDateYear', 'ArrivalDateMonthNumber', 'ArrivalDateDayOfMonth', 'ArrivalDateWeekNumber', 'Adults', 'StaysInWeekendNights', 'PreviousCancellations', 'PreviousBookingsNotCanceled', 'StaysInWeekNights', 'BookingChanges', 'DaysInWaitingList', 'TotalOfSpecialRequests', 'RequiredCarParkingSpaces', 'ADR', 'Kids']
remove_num = ['ArrivalDateYear', 'ArrivalDateWeekNumber', 'ArrivalDateDayOfMonth', 'ArrivalDateMonthNumber']
for x in remove_num:
num.remove(x)
cat.remove('ReservationStatus') #we need to remove this variable, because it can influence the model's performance
# One hot encode categorical features
bookings3 = bookings2.copy()
#selecting non high cardinality variables to encode
ohc_features = cat.copy()
ohc = OneHotEncoder(sparse=False)
df_ohc = pd.DataFrame(ohc.fit_transform(bookings3[ohc_features]),
index=bookings3.index,
columns=ohc.get_feature_names(ohc_features))
scaler = MinMaxScaler().fit(bookings2[num])
# scale num features
bookings3 = pd.DataFrame(scaler.transform(bookings3[num]) , columns=bookings3[num].columns).set_index(bookings3[num].index)
bookings3 = pd.concat([bookings3[num], df_ohc, bookings2['IsCanceled']], axis=1)
bookings3.head()
C:\Users\andre\anaconda3\envs\machinelearning\lib\site-packages\sklearn\utils\deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.
| LeadTime | Adults | StaysInWeekendNights | PreviousCancellations | PreviousBookingsNotCanceled | StaysInWeekNights | BookingChanges | DaysInWaitingList | TotalOfSpecialRequests | RequiredCarParkingSpaces | ... | Agent_Other | Company_ NULL | Company_Company | CustomerType_Contract | CustomerType_Group | CustomerType_Transient | CustomerType_Transient-Party | RoomChange_No | RoomChange_Yes | IsCanceled | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.009539 | 0.25 | 0.000000 | 0.0 | 0.0 | 0.058824 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0 |
| 1 | 0.139905 | 0.50 | 0.000000 | 0.0 | 0.0 | 0.117647 | 0.0 | 0.0 | 0.2 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1 |
| 2 | 0.103339 | 0.25 | 0.000000 | 0.0 | 0.0 | 0.117647 | 0.0 | 0.0 | 0.2 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1 |
| 3 | 0.146264 | 0.50 | 0.142857 | 0.0 | 0.0 | 0.117647 | 0.0 | 0.0 | 0.4 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1 |
| 4 | 0.158983 | 0.50 | 0.000000 | 0.0 | 0.0 | 0.058824 | 0.0 | 0.0 | 0.2 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1 |
5 rows × 53 columns
bookings3.to_csv("bookings3.csv", index=True)
final_dataset = bookings3.copy()
final_dataset['IsCanceled'].value_counts()
0 37237 1 16034 Name: IsCanceled, dtype: int64
(16034 / (37237 + 16034)) #percentage of Cancellations in the total of Bookings
0.30098928122242874
# pairplot of numeric features
sns.pairplot(final_dataset[num], diag_kind="hist")
plt.subplots_adjust(top=0.95)
plt.suptitle("Pairwise Relationship of Selected Numeric Variables for Modeling", fontsize=20)
plt.show()
# Prepare figure
fig = plt.figure(figsize=(10, 8))
# Obtain correlation matrix. Round the values to 2 decimal cases. Use the DataFrame corr() and round() method.
corr = np.round(final_dataset[num].corr(method="pearson"), decimals=2)
# Build annotation matrix (values above |0.5| will appear annotated in the plot)
mask_annot = np.absolute(corr.values) >= 0.5
annot = np.where(mask_annot, corr.values, np.full(corr.shape,""))
# Plot heatmap of the correlation matrix
sns.heatmap(data=corr, annot=annot, cmap=sns.diverging_palette(220, 10, as_cmap=True),
fmt='s', vmin=-1, vmax=1, center=0, square=True, linewidths=.5)
# Layout
fig.subplots_adjust(top=0.95)
fig.suptitle("Pearson Correlation Matrix", fontsize=20)
plt.show()
final_dataset.to_csv('finaldataset.csv', index=True)